#!/bin/bash
# (C) Copyright IBM Corp. 2018 All Rights Reserved.

# Clear out replicated postgres transaction ids so that they can be reset downwards.
# Run this script with the -check option on each replication node, then follow instructions.
# (more detail in attached technote).

# Initialization
# Capture how the script was invoked.
nargs=$#            # argument count at start-up
script=${0##*/}     # script name without its directory part

# Evaluate whatever nzenv prints so its settings land in this shell.
# NOTE(review): presumably this is what defines NZ_KIT_DIR / NZ_DATA_DIR used below -- confirm.
eval $(NZENV_ARG0="$0" NZENV_OUTPUT=-a nzenv "$@")

# Tools shipped with the kit.
NZSQL="${NZ_KIT_DIR}/bin/nzsql"
NZHOSTBACKUP="${NZ_KIT_DIR}/bin/nzhostbackup"
SENDMAIL="${NZ_KIT_DIR}/sbin/sendMail"

# Options passed on every nzsql invocation (expanded unquoted so they split into words).
sqlargs="-admin -db system"

# Working files: default catalog backup target and the nzsql output scratch log.
hostBackupFile="/tmp/${script}.hostbackup.tar.gz"
TMPLOG="/tmp/${script}.tmp"

# Run-time state.
force=0                 # set to 1 by '-reset -force'
curTxId=0               # filled in by getXid
bufferTxId=0            # filled in by getXid
nodeCnt=0
warning=2000000000      # XID threshold at which a reset is advised

# email address to notify in-case of PG XID overflow
mailTo=""

# Verify NZSQL was a success by scanning its captured output.
# Arguments: $1 - path of the file holding nzsql's stdout/stderr.
# Behavior:
#   - file does not exist (or no path given): nothing is checked, returns 0.
#   - any line contains ERROR or FATAL: prints a message and exits the script
#     with status 1, leaving the file in place for inspection.
#   - otherwise: truncates the file to zero length so it can be reused.
checkSqlFailure ()
{
    local output=$1

    # Quoted throughout so a path containing whitespace stays a single word;
    # an empty path simply fails the -e test instead of making a reader block on stdin.
    if [ -e "$output" ] ; then
        if grep -Eq -- "ERROR|FATAL" "$output" ; then
            echo "Encountered error while running nzsql, please fix the error and rerun the script."
            echo "Refer to $output for more details."
            exit 1
        fi
        truncate -s 0 "$output"
    fi
}

# Do nothing if we're not a replication node, otherwise set nodeName and setName.
# Globals read:    NZSQL, sqlargs
# Globals written: nodeName, setName (stored exactly as nzsql printed them)
# Exits: 2 when no replication node name is configured,
#        1 when nzsql itself reports failure.
checkReplication ()
{
    local trimmed

    # A failing query yields empty output; without this check that would be
    # mistaken for a configured node and the script would carry on.
    # shellcheck disable=SC2086 -- $sqlargs is an option list and must word-split
    nodeName=$("$NZSQL" $sqlargs -t -c "select DEFVALUE from _t_systemdef where DEFOPTION='REPLICATION NODE_NAME'") || {
        echo "ERROR: Unable to query the replication node name with nzsql."
        exit 1
    }

    # Treat empty or whitespace-only output as "no node name configured",
    # rather than matching one exact single-space string.
    trimmed=${nodeName//[[:space:]]/}
    if [ -z "$trimmed" ] ; then
        echo "Not a replication node, nothing to do."
        exit 2
    fi

    # shellcheck disable=SC2086
    setName=$("$NZSQL" $sqlargs -t -c "select SET_NAME from _T_REPL_SET") || {
        echo "ERROR: Unable to query the replication set name with nzsql."
        exit 1
    }
}

# Read the highest postgres transaction id and print how many ids are left
# before 2147483647 (2^31-1).
# Globals written: curTxId, bufferTxId (plus whatever checkReplication sets).
getXid ()
{
    # Leaves the script early when this host is not a replication node.
    checkReplication

    # Largest pgxid currently recorded in _t_pgtx.
    curTxId=$($NZSQL $sqlargs -t -c "select max(pgxid) from _t_pgtx")

    # Headroom below 2147483647, computed by the database itself.
    bufferTxId=$($NZSQL $sqlargs -t -c "select 2147483647 - max(pgxid) buffer from _t_pgtx")

    printf '%s\n' "System has $bufferTxId postgres transaction ids remaining before replication may be unable to continue."
}

# Check the current PG XID against the warning limit.
# Globals read:    warning, force, SENDMAIL, mailTo, script, setName, nodeName
# Globals written: curTxId, bufferTxId (via getXid)
# Outcome:
#   - id cannot be read as a number: prints an error and exits 1.
#   - id below the limit: prints a notice; exits 1 unless force is non-zero.
#   - id at or above the limit: sends a notification through $SENDMAIL.
checkXid ()
{
    local xid

    getXid

    # The value is command output and may carry surrounding blanks; compare on
    # a stripped copy and leave curTxId itself untouched for the messages.
    xid=${curTxId//[[:space:]]/}

    # Without this guard an empty or garbled id makes the numeric test error
    # out, which falls through to the "exceeds the limit" branch below.
    if ! [[ $xid =~ ^[0-9]+$ ]] ; then
        echo "ERROR: Unable to determine the current postgres transaction id (got '$curTxId')."
        exit 1
    fi

    # Warning limit is 2 billion, i.e. about 147M below 2^31.
    if [ "$xid" -lt "$warning" ] ; then
        echo "Current postgres transaction id $curTxId is below the warning limit $warning."
        if [ "$force" -eq 0 ] ; then
            exit 1
        fi
    else
        # NOTE(review): mailTo stays empty unless '-check -sendmail <addr>' was
        # given, yet sendMail is still invoked -- confirm an empty -dst is acceptable.
        "$SENDMAIL"                                                                             \
            -dst "$mailTo"                                                                      \
            -msg "Current postgres transaction id $curTxId exceeds the warning limit $warning." \
            -bodyText "Please run '$script -check' on replication node $setName.$nodeName to verify current status, and follow instructions."
    fi
}

# Clear the replicated transaction ids from the catalog on the current node.
# Steps, in order: check the XID (checkXid may exit), show the sync state,
# suspend replication, write a host catalog backup, truncate the replication
# catalog tables, vacuum, then print the follow-up instructions.
# Globals read: NZSQL, sqlargs, NZHOSTBACKUP, NZ_DATA_DIR, NZ_KIT_DIR,
#               hostBackupFile, TMPLOG, setName, nodeName, script
# Exits 1 when nzsql output shows an error or when the backup fails.
resetCatalog ()
{
    # Check the PG XID status.
    checkXid

    # Show current sync state
    # shellcheck disable=SC2086 -- $sqlargs is an option list and must word-split
    "$NZSQL" $sqlargs -c "select SETID, SET_NAME, NODEID, NODE_NAME, NODE_ROLE, NODE_STATE, NODE_CSN from _v_replication_sync"

    # Alter replication
    echo "Suspending replication on current node."
    echo ""
    # shellcheck disable=SC2086
    "$NZSQL" $sqlargs -c "alter replication node $setName.$nodeName state suspended" >"$TMPLOG" 2>&1

    checkSqlFailure "$TMPLOG"

    # Get the hostbackup. The target may be user-supplied, so it (and the data
    # directory) are quoted to survive whitespace in the path.
    echo "Writing host catalog backup to $hostBackupFile."
    if ! "$NZHOSTBACKUP" -g 600 -D "$NZ_DATA_DIR" --skip_repl_check --nokeydb "$hostBackupFile" ; then
        # Never truncate catalog tables without a backup to fall back on.
        echo "nzhostbackup failed, cannot proceed without a catalog backup."
        exit 1
    fi

    # Truncate replication tables
    echo ""
    echo "Truncating selected replication catalog tables to clear replicated transaction ids."
    echo ""
    # shellcheck disable=SC2086
    "$NZSQL" $sqlargs >"$TMPLOG" 2>&1 <<'EOF'
truncate table _t_repl_commit_log;
truncate table _t_repl_capture_by_value_load;
truncate table _t_repl_capture_dbaccess;
truncate table _t_repl_capture_extra;
truncate table _t_repl_capture_invis;
truncate table _t_repl_capture_sequence;
truncate table _t_repl_capture_tableaccess;
truncate table _t_repl_external_file;
truncate table _t_tmp_repl_node;
truncate table _t_tmp_repl_set;
truncate table _t_tmp_repl_set_node;
truncate table _t_tmp_repl_set_node_v1;
truncate table _t_tmp_repl_state;
EOF

    checkSqlFailure "$TMPLOG"

    # Vacuum
    echo ""
    echo "Running a vacuum on system catalog."
    echo ""
    # shellcheck disable=SC2086
    "$NZSQL" -q $sqlargs -c "vacuum"

    echo ""
    echo "You should next re-upgrade the postgres catalog (which will involve a 'nzstop'),"
    echo "using the following command (as root) -"
    echo "'$NZ_KIT_DIR/sbin/nzupgrade -r -T catupgrade upgrade DOCATUPGRADE=TRUE'"
    echo "After which you should 'nzstart' the system and run '$script -resume' to Activate replication on current node."
}

# Resume the system after catalog upgrade: re-activate replication on the
# current node, but only once the XID is confirmed to be below the warning limit.
# Globals read:    NZSQL, sqlargs, TMPLOG, warning, setName, nodeName
# Globals written: curTxId, bufferTxId (via getXid)
# Exits 1 when the XID cannot be read, is still at/above the limit, or the
# nzsql output shows an error.
resumeReplication ()
{
    local xid

    # Check the current Max Postgres TxId
    getXid

    echo ""
    echo "Current Postgres XID:$curTxId"
    echo ""

    # The value is command output and may carry surrounding blanks; compare on
    # a stripped copy. An empty or garbled id must stop here: otherwise the
    # numeric test below errors out, reads as "false", and replication would be
    # activated without the XID ever having been verified.
    xid=${curTxId//[[:space:]]/}
    if ! [[ $xid =~ ^[0-9]+$ ]] ; then
        echo "ERROR: Unable to determine the current postgres transaction id (got '$curTxId')."
        exit 1
    fi

    if [ "$xid" -ge "$warning" ] ; then
        echo "ERROR: Current postgres transaction id $curTxId exceeds the warning limit $warning."
        exit 1
    fi

    # Alter replication
    echo "Activating replication on current node."
    echo ""
    # shellcheck disable=SC2086 -- $sqlargs is an option list and must word-split
    "$NZSQL" $sqlargs -c "alter replication node $setName.$nodeName state active" >"$TMPLOG" 2>&1

    checkSqlFailure "$TMPLOG"

    # Show current sync state
    # shellcheck disable=SC2086
    "$NZSQL" $sqlargs -c "select SETID, SET_NAME, NODEID, NODE_NAME, NODE_ROLE, NODE_STATE, NODE_CSN from _v_replication_sync"
}

# Process command-line options.
# Exactly one mode is taken from $1:
#   -check  [-sendmail mailTo]   report XID status (mailTo receives the warning mail)
#   -reset  [hostBackupFile]     clear replicated transaction ids; '-force' may be
#                                given before or after the backup file to proceed
#                                even when the XID is below the warning limit
#   -resume                      re-activate replication after the catalog upgrade
#   -h                           print usage
# Each branch shifts away the arguments it consumed so that anything left over
# can be reported at the end.
OPTION=$1
case "$OPTION" in
    "-check" )
        if [ "$2" = "-sendmail" ] ; then
            if [ -n "$3" ] ; then
                mailTo=$3
                shift
            else
                echo "ERROR: No mailTo address specified with -sendmail."
                exit 1
            fi
            shift
        fi
        shift
        checkXid
        if [ "$curTxId" -ge "$warning" ] ; then
            echo ""
            echo "Please run or schedule a convenient time to run"
            echo "'$script -reset [hostBackupFile]' to reset the postgres transaction id."
        fi
        ;;

    "-reset" )
        if [ "$2" = "-force" ] ; then
            # -reset -force [hostBackupFile]
            force=1
            if [ -n "$3" ] ; then
                hostBackupFile=$3
                shift
            fi
            shift
        elif [ "$3" = "-force" ] ; then
            # -reset hostBackupFile -force
            force=1
            if [ -n "$2" ] ; then
                hostBackupFile=$2
            fi
            shift
            shift
        elif [ -n "$2" ] ; then
            # -reset hostBackupFile
            hostBackupFile=$2
            shift
        fi
        shift
        resetCatalog
        ;;

    "-resume" )
        shift
        resumeReplication
        ;;

    "-h" )
        shift
        echo "Usage: $script { -check [-sendmail mailTo] | -reset [hostBackupFile] | -resume | -h }"
        ;;

    *)
        echo "ERROR: An unknown/invalid option was specified '$1'"
        echo "Usage: $script { -check [-sendmail mailTo] | -reset [hostBackupFile] | -resume | -h }"
        exit 1
        ;;
esac

# Ignore extra/multiple arguments.
# "$1" is quoted so a leftover argument containing blanks or glob characters
# does not break the test; $* joins the leftovers into one string for display.
if [ -n "$1" ] ; then
    echo "Ignoring extra arguments '$*'"
fi
